To compare the performance of varKode to Skmer, we will use leave-one-out cross validation: we remove one sample from the dataset, train a varKode model or make a skmer reference with the remaining samples, and then use the sample left out as query. We then record whether or not we correctly identify this sample in varKoder, and whether or not the closest sample with Skmer has the same identification.
For traditional barcodes, we assembled the genome of each sample, and then used BLAST to search for each of the traditional barcode genes. We recorded if we could find this gene in the assembly, coding as missing data if we could not. We then recorded whether the best BLAST hit for a sample was the correct species.
rm(list=ls())
library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.3 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.2 ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(future)
library(ggthemes)
library(patchwork)
library(cowplot)
Attaching package: ‘cowplot’
The following object is masked from ‘package:patchwork’:
align_plots
The following object is masked from ‘package:ggthemes’:
theme_map
The following object is masked from ‘package:lubridate’:
stamp
library(patchwork)
library(phytools)
Loading required package: ape
Attaching package: ‘ape’
The following object is masked from ‘package:dplyr’:
where
Loading required package: maps
Attaching package: ‘maps’
The following object is masked from ‘package:purrr’:
map
library(ape)
set.seed(14164)
For varKoder, we used leave-one-out cross-validation to test the accuracy at the family, genus and species levels in the joint Malpighiaceae-Chrysobalanaceae dataset. We used as input data varKodes produced from kmers of size 7 and 500 Kbp to 200 Mbp of data, or all of the data available if less than 200 Mbp. For each sample, we built a model using as input data from all other samples. Then we queried the sample left out, using as input the images generated from 500 Kbp to the total data available. Now we will summarize the results.
In this test, we used varKoder v0.8.0. Let’s process the results.
#' Read varKoder leave-one-out predictions and score them at each taxonomic rank.
#'
#' Every file matching 'predictions.csv' below `infolder` is read in parallel
#' and row-bound. The first column is dropped, only queries whose size label is
#' a standardized value (zero-padded 1, 2 or 5 followed by zeros and 'K') are
#' kept, and each row receives `<rank>_correct` / `<rank>_incorrect` flags for
#' family, genus and species.
#'
#' @param infolder Directory searched recursively for prediction files.
#' @param workers Number of parallel R sessions used while reading (default 12).
#' @return A row-wise tibble of predictions with the list-columns `query_labels`
#'   and `predicted_list` and six logical flag columns. Species flags are `NA`
#'   for samples that carry no species label.
read_and_process_xval <- function(infolder, workers = 12) {
  # Read in parallel, and make sure the sequential plan is restored even if
  # reading or parsing fails part-way through.
  plan(multisession, workers = workers)
  on.exit(plan(sequential), add = TRUE)

  prediction_files <- list.files(infolder,
                                 'predictions.csv',
                                 recursive = TRUE,
                                 full.names = TRUE)

  varkoder_results <- prediction_files %>%
    furrr::future_map_dfr(~read_csv(.x) %>% mutate(sample_id = as.character(sample_id))) %>%
    select(-1) %>%
    # we will ignore queries that are not standardized sizes
    filter(str_detect(query_basepairs, '^0+[125]0+K$')) %>%
    rename(query_bp = query_basepairs) %>%
    mutate(quality_included = TRUE)

  # Parallel workers are only needed for reading.
  plan(sequential)

  varkoder_results %>%
    mutate(
      # Split first and drop the quality flag afterwards. Removing the flag
      # together with its surrounding ';' from the raw string would glue two
      # labels together whenever the flag sits between them.
      query_labels = str_split(actual_labels, ';') %>%
        lapply(function(labels) labels[labels != 'low_quality:True']),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    rowwise() %>%
    mutate(
      # *_correct: the true label of that rank is among the predicted labels.
      family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
      genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
      species_correct = ifelse(any(str_detect(query_labels, 'species')),
                               query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
                               NA),
      # *_incorrect: at least one predicted label of that rank is not a true label.
      family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in%
                                 query_labels[str_detect(query_labels, 'family')])),
      genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in%
                                query_labels[str_detect(query_labels, 'genus')])),
      species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                                 any(!(predicted_list[str_detect(predicted_list, 'species')] %in%
                                         query_labels[str_detect(query_labels, 'species')])),
                                 NA)
    )
}
#' Tabulate identification outcomes per query size for one taxonomic rank.
#'
#' Each row of `res` is classified from its `<level>_correct` and
#' `<level>_incorrect` flags:
#'   correct      - correct and not incorrect
#'   ambiguous    - correct and incorrect
#'   incorrect    - not correct and incorrect
#'   inconclusive - neither
#' Rows where either flag is `NA` are dropped.
#'
#' @param res Data frame with `query_bp` and the two logical flag columns.
#' @param level Rank prefix of the flag columns, e.g. 'family', 'genus', 'species'.
#' @return A tibble with one row per `query_bp` x `result` combination holding
#'   the count `N` and the within-size fraction `p`; combinations that did not
#'   occur are filled with 0. `query_bp` is numeric: the input value with any
#'   'K' removed, times 1000.
summarize_results <- function(res, level) {
  correct_col <- str_c(level, 'correct', sep = '_')
  incorrect_col <- str_c(level, 'incorrect', sep = '_')

  res %>%
    ungroup() %>%
    mutate(
      result = case_when(
        # case_when treats NA conditions as FALSE, so keep NA explicit to
        # drop these rows below instead of counting them as inconclusive.
        is.na(.data[[correct_col]]) | is.na(.data[[incorrect_col]]) ~ NA_character_,
        .data[[correct_col]] & !.data[[incorrect_col]] ~ 'correct',
        .data[[correct_col]] & .data[[incorrect_col]] ~ 'ambiguous',
        !.data[[correct_col]] & .data[[incorrect_col]] ~ 'incorrect',
        TRUE ~ 'inconclusive'
      )
    ) %>%
    filter(!is.na(result)) %>%
    group_by(query_bp, result) %>%
    summarise(N = n(), .groups = 'drop') %>%
    group_by(query_bp) %>%
    mutate(p = N / sum(N)) %>%
    ungroup() %>%
    mutate(query_bp = as.integer(str_remove(query_bp, 'K')) * 1000) %>%
    # Go through a factor so complete() only crosses the observed sizes.
    mutate(query_bp = as.factor(query_bp)) %>%
    complete(query_bp, result, fill = list(p = 0, N = 0)) %>%
    mutate(query_bp = as.numeric(as.character(query_bp)))
}
#' Stacked-area plot of identification outcomes against query size.
#'
#' @param sum_df Summary table with columns `query_bp`, `result`, `N` and `p`.
#' @param title Plot title.
#' @param relative If TRUE plot the fraction `p` (y axis 0-1); otherwise plot
#'   the count `N` (y axis 0 to the largest per-size total).
#' @param grid If TRUE draw horizontal grid lines on top of the areas.
#' @param xlim_all If TRUE the x axis spans all standard query sizes
#'   (500 Kbp - 200 Mbp); otherwise only the range present in `sum_df`.
#' @param wrap Optional faceting formula (anything `as.formula()` accepts).
#' @return A ggplot object.
plot_area <- function(sum_df, title, relative = FALSE, grid = TRUE, xlim_all = TRUE, wrap) {
  breaks <- c(500000,
              1000000,
              2000000,
              5000000,
              10000000,
              20000000,
              50000000,
              100000000,
              200000000)
  result_levels <- c('correct', 'ambiguous', 'inconclusive', 'incorrect')

  xlimits <- if (xlim_all) range(breaks) else range(sum_df$query_bp)

  sum_df <- sum_df %>%
    mutate(result = factor(result, ordered = TRUE, levels = result_levels))

  # The relative and absolute versions differ only in the y variable, its
  # label and its limits.
  if (relative) {
    y_var <- 'p'
    y_label <- 'Fraction of samples'
    ylimits <- c(0, 1)
  } else {
    y_var <- 'N'
    y_label <- 'Number of samples'
    ylimits <- c(0, sum_df %>% group_by(query_bp) %>% summarize(N = sum(N)) %>% pull(N) %>% max)
  }

  # Add the y scale exactly once; adding a second one later makes ggplot2
  # emit "Scale for y is already present" on every call.
  y_scale <- if (grid) {
    scale_y_continuous(n.breaks = 10, minor_breaks = waiver())
  } else {
    scale_y_continuous()
  }

  # Get colors from a Color Brewer palette
  brewer_colors <- RColorBrewer::brewer.pal(4, "Accent")

  p1 <- ggplot(sum_df, aes(x = query_bp, y = .data[[y_var]], fill = result)) +
    geom_area(position = 'stack') +
    scale_fill_manual(values = setNames(brewer_colors, result_levels)) +
    scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')), breaks = breaks) +
    y_scale +
    ggtitle(title) +
    ylab(y_label) +
    xlab('Base pairs in query images') +
    theme_few() +
    theme(axis.text.x = element_text(hjust = 1, angle = 45))

  if (grid) {
    # A transparent panel drawn on top lets the grid lines show over the areas.
    p1 <- p1 +
      theme(panel.background = element_rect(fill = NA),
            panel.grid.major.y = element_line(colour = gray(0.5)),
            panel.grid.minor.y = element_line(colour = gray(0.6), linetype = 2),
            panel.ontop = TRUE)
  }

  p1 <- p1 + coord_cartesian(xlim = xlimits, ylim = ylimits, expand = FALSE)

  if (!missing(wrap)) {
    p1 <- p1 + facet_wrap(as.formula(wrap))
  }

  p1
}
Now let’s plot genus-level accuracy for a model taking quality labels into account:
# Load every varKoder cross-validation prediction, then tabulate and plot the
# genus-level outcome as a fraction of samples per query size.
results <- read_and_process_xval(
  'Malpighiaceae+Chrysobalanaceae/varKoder/vit_results/'
)
summary_genus <- summarize_results(results, level = 'genus')
p_genus <- plot_area(summary_genus, title = 'varKoder genus', relative = TRUE)
p_genus
Now the same but with species
# Species-level outcome per query size (samples without a species label drop out).
summary_species <- summarize_results(results, level = 'species')
p_species <- plot_area(summary_species, title = 'varKoder species', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_species
Finally, family
# Family-level outcome per query size.
summary_family <- summarize_results(results, level = 'family')
p_family <- plot_area(summary_family, title = 'varKoder family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_family
Now we will try to identify which samples failed and why they failed. Particularly, how do DNA quality, amount of data, and the number of samples per class impact results? We will use genus-level predictions to test.
# One row per query: the confidence the model assigned to the query's true genus.
genus_predictions <- results %>%
  ungroup() %>%
  mutate(actual_genus = str_extract(actual_labels, 'genus:[^;]*')) %>%
  select(-starts_with('family'), -starts_with('species')) %>%
  # Pivot only the per-genus confidence columns ('genus:<name>'). A bare
  # starts_with('genus') would also pull in the logical genus_correct /
  # genus_incorrect flags and coerce them to numbers.
  pivot_longer(cols = starts_with('genus:'),
               names_to = 'predicted_label',
               values_to = 'confidence') %>%
  filter(actual_genus == predicted_label) %>%
  select(query_bp, sample_id, basefrequency_sd, actual_genus, confidence) %>%
  mutate(query_bp = 1000 * as.integer(str_remove(query_bp, 'K')))

# Attach the number of distinct samples available for each genus
# (this count includes the query sample itself).
genus_predictions <- genus_predictions %>%
  select(sample_id, actual_genus) %>%
  distinct() %>%
  group_by(actual_genus) %>%
  summarise(N_samples = n()) %>%
  right_join(genus_predictions, by = 'actual_genus')
Joining with `by = join_by(actual_genus)`
genus_predictions %>% arrange(N_samples)
Now let’s make some plots. First, what is the effect of number of samples per class in confidence?
# Confidence in the true genus against the number of training samples of that
# genus (N_samples minus the query itself). Seeded so the jitter is repeatable.
# A logit-scaled y axis was tried as an alternative and is left disabled.
set.seed(13214526)
plot_genus_N_vs_conf <- genus_predictions %>%
  ggplot(mapping = aes(x = N_samples-1, y = confidence)) +
  geom_jitter(alpha = 0.3) +
  scale_color_viridis_c() +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  xlab('Number of training samples in correct genus\n(log scale)') +
  ylab('Confidence in correct genus prediction') +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_N_vs_conf
Now, what is the effect of sample quality in confidence?
# Confidence in the true genus against the standard deviation of base
# frequencies of the sample (x on a log scale, y fixed to 0-1).
# A logit-scaled y axis was tried as an alternative and is left disabled.
set.seed(13214526)
plot_genus_freqsd_vs_conf <- genus_predictions %>%
  ggplot(mapping = aes(x = basefrequency_sd, y = confidence)) +
  geom_point(alpha = 0.3) +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  xlab('Standard deviation of base frequencies') +
  ylab('Confidence in correct genus prediction') +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_freqsd_vs_conf
Now, what is the effect of amount of data in confidence?
# Confidence in the true genus against the amount of query data (log x axis).
# Seeded so the jitter is repeatable.
# A logit-scaled y axis was tried as an alternative and is left disabled.
set.seed(13214526)
plot_genus_bp_vs_conf <- genus_predictions %>%
  ggplot(mapping = aes(x = query_bp, y = confidence)) +
  geom_jitter(alpha = 0.3) +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  xlab('Base pairs in query images\n(log scale)') +
  ylab('Confidence in correct genus prediction') +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_bp_vs_conf
Let’s put it all together now in a linear model. We remove one from N_samples since one sample is used for validation in the leave-one-out cross-validation that we did.
# Model data: the response is the logit of the confidence (values of exactly 1
# are nudged just below 1 first); the three predictors are converted to
# z-scores, with N_samples counted without the query sample.
lm_data <- genus_predictions %>%
  mutate(
    confidence = car::logit(ifelse(confidence == 1, confidence - 0.0000001, confidence)),
    N_samples = N_samples - 1
  ) %>%
  mutate(
    query_bp = (query_bp - mean(query_bp)) / sd(query_bp),
    basefrequency_sd = (basefrequency_sd - mean(basefrequency_sd)) / sd(basefrequency_sd),
    N_samples = (N_samples - mean(N_samples)) / sd(N_samples)
  )

# Full model with all two- and three-way interactions.
full_model <- lm(formula = confidence~query_bp*basefrequency_sd*N_samples, data = lm_data)
full_model
Call:
lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
data = lm_data)
Coefficients:
(Intercept) query_bp basefrequency_sd
4.98965 0.16528 -0.56243
N_samples query_bp:basefrequency_sd query_bp:N_samples
1.63728 0.21806 0.03515
basefrequency_sd:N_samples query_bp:basefrequency_sd:N_samples
0.05848 0.09003
reduced_model = step(full_model,direction = 'both')
Start: AIC=3363.26
confidence ~ query_bp * basefrequency_sd * N_samples
Df Sum of Sq RSS AIC
- query_bp:basefrequency_sd:N_samples 1 0.79515 9941.8 3361.4
<none> 9941.0 3363.3
Step: AIC=3361.44
confidence ~ query_bp + basefrequency_sd + N_samples + query_bp:basefrequency_sd +
query_bp:N_samples + basefrequency_sd:N_samples
Df Sum of Sq RSS AIC
- query_bp:N_samples 1 0.2369 9942.0 3359.5
- basefrequency_sd:N_samples 1 0.3082 9942.1 3359.5
- query_bp:basefrequency_sd 1 7.6355 9949.4 3361.2
<none> 9941.8 3361.4
+ query_bp:basefrequency_sd:N_samples 1 0.7952 9941.0 3363.3
Step: AIC=3359.49
confidence ~ query_bp + basefrequency_sd + N_samples + query_bp:basefrequency_sd +
basefrequency_sd:N_samples
Df Sum of Sq RSS AIC
- basefrequency_sd:N_samples 1 0.2658 9942.3 3357.6
- query_bp:basefrequency_sd 1 7.4113 9949.4 3359.2
<none> 9942.0 3359.5
+ query_bp:N_samples 1 0.2369 9941.8 3361.4
Step: AIC=3357.56
confidence ~ query_bp + basefrequency_sd + N_samples + query_bp:basefrequency_sd
Df Sum of Sq RSS AIC
- query_bp:basefrequency_sd 1 7.2 9949.5 3357.2
<none> 9942.3 3357.6
+ basefrequency_sd:N_samples 1 0.3 9942.0 3359.5
+ query_bp:N_samples 1 0.2 9942.1 3359.5
- N_samples 1 5618.2 15560.5 4367.5
Step: AIC=3357.2
confidence ~ query_bp + basefrequency_sd + N_samples
Df Sum of Sq RSS AIC
<none> 9949.5 3357.2
+ query_bp:basefrequency_sd 1 7.2 9942.3 3357.6
+ basefrequency_sd:N_samples 1 0.1 9949.4 3359.2
+ query_bp:N_samples 1 0.0 9949.5 3359.2
- query_bp 1 27.9 9977.4 3361.5
- basefrequency_sd 1 942.4 10891.9 3559.6
- N_samples 1 5611.2 15560.7 4365.5
summary(reduced_model)
Call:
lm(formula = confidence ~ query_bp + basefrequency_sd + N_samples,
data = lm_data)
Residuals:
Min 1Q Median 3Q Max
-13.9246 -0.9992 0.3124 1.3751 6.7283
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.96484 0.04419 112.340 <2e-16 ***
query_bp 0.11326 0.04506 2.513 0.012 *
basefrequency_sd -0.65645 0.04492 -14.615 <2e-16 ***
N_samples 1.62074 0.04545 35.661 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.101 on 2255 degrees of freedom
Multiple R-squared: 0.4425, Adjusted R-squared: 0.4418
F-statistic: 596.7 on 3 and 2255 DF, p-value: < 2.2e-16
plot(reduced_model)
Now let’s save the three of them as a single plot using patchwork.
# Recreate the standardized predictors used for the model, keeping the
# original values alongside (N is counted without the query sample).
preds <- genus_predictions %>%
  transmute(
    original_N = N_samples - 1,
    original_bp = query_bp,
    original_sd = basefrequency_sd
  ) %>%
  mutate(
    query_bp = (original_bp - mean(original_bp)) / sd(original_bp),
    basefrequency_sd = (original_sd - mean(original_sd)) / sd(original_sd),
    N_samples = (original_N - mean(original_N)) / sd(original_N)
  )

# Prediction grid: sweep one predictor over its observed values at a time
# while the other two stay at 0, i.e. at their mean on the z-score scale.
preds <- list(
  expand.grid(query_bp = unique(preds$query_bp),
              N_samples = 0,
              basefrequency_sd = 0),
  expand.grid(query_bp = 0,
              N_samples = unique(preds$N_samples),
              basefrequency_sd = 0),
  expand.grid(query_bp = 0,
              N_samples = 0,
              basefrequency_sd = unique(preds$basefrequency_sd))
) %>%
  bind_rows()
# Inverse logit: maps log-odds `L` (scalar or vector) back to the 0-1 scale.
logistic <- function(L) {
  inverse_odds <- exp(-L)
  1 / (1 + inverse_odds)
}
# Fitted values with confidence bounds from the reduced model, mapped back from
# the logit scale, next to the grid predictors returned to their original
# units. N_samples ends up on the "training samples" scale (N_samples - 1).
preds <- bind_cols(
  predict(reduced_model, newdata = preds, interval = 'conf') %>%
    as_tibble() %>%
    mutate(across(everything(), ~ logistic(.x) + 0.0000001)) %>%
    rename(confidence = fit),
  mutate(
    preds,
    N_samples = N_samples * sd(genus_predictions$N_samples - 1) +
      mean(genus_predictions$N_samples - 1),
    basefrequency_sd = basefrequency_sd * sd(genus_predictions$basefrequency_sd) +
      mean(genus_predictions$basefrequency_sd),
    query_bp = query_bp * sd(genus_predictions$query_bp) +
      mean(genus_predictions$query_bp)
  )
)
# Each panel overlays the fitted curve and its confidence band on the scatter.
# Only the rows of `preds` in which that panel's predictor varies are used;
# the remaining rows hold it exactly at its mean.
fit_vs_N <- filter(preds, N_samples != mean(genus_predictions$N_samples-1))
fit_vs_bp <- filter(preds, query_bp != mean(genus_predictions$query_bp))
fit_vs_sd <- filter(preds, basefrequency_sd != mean(genus_predictions$basefrequency_sd))

# Panel A: the base plot maps x = N_samples - 1, but N_samples in `preds` is
# already the number of training samples. Map x explicitly so the curve is
# not shifted one sample to the left of the points.
panel_N <- plot_genus_N_vs_conf +
  theme(text = element_text(size = 8)) +
  geom_line(data = fit_vs_N,
            mapping = aes(x = N_samples),
            color = "blue") +
  geom_ribbon(data = fit_vs_N,
              mapping = aes(x = N_samples, ymin = lwr, ymax = upr),
              fill = "lightblue",
              alpha = 0.4)

# Panels B and C share the y axis with panel A, so its title and labels are hidden.
panel_bp <- plot_genus_bp_vs_conf +
  theme(axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        text = element_text(size = 8)) +
  geom_line(data = fit_vs_bp, color = "blue") +
  geom_ribbon(data = fit_vs_bp,
              mapping = aes(ymin = lwr, ymax = upr),
              fill = "lightblue",
              alpha = 0.4)

panel_sd <- plot_genus_freqsd_vs_conf +
  theme(axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        text = element_text(size = 8)) +
  geom_line(data = fit_vs_sd, color = "blue") +
  geom_ribbon(data = fit_vs_sd,
              mapping = aes(ymin = lwr, ymax = upr),
              fill = "lightblue",
              alpha = 0.4)

combined_conf <- patchwork::wrap_plots(panel_N, panel_bp, panel_sd) +
  patchwork::plot_annotation(tag_levels = 'A',
                             title = 'Factors affecting varKode prediction accuracy',
                             theme = theme(plot.title = element_text(hjust = 0.5)))
combined_conf

# Name the plot explicitly instead of relying on whichever plot was shown last.
ggsave(filename = 'images_manuscript/supp_conf_predictors.pdf',
       plot = combined_conf,
       device = 'pdf',
       width = 7,
       height = 3,
       units = 'in',
       useDingbats = FALSE)
For skmer, we left each sample out, built a reference and then queried that sample. We have several files in which reference samples are ordered by their distance to the query; here we will evaluate whether the closest sample is from the correct species or genus.
Because it is not clear how skmer behaves for different levels of coverage, we repeated this for several input sizes (in number of basepairs) as query, but always used the maximum input size available (up to 200Mb) for references.
Let’s make a function that extracts these results as a table.
samp_labels = results %>% select(sample_id,actual_labels) %>% distinct()
#' Parse the query and its closest reference from one skmer result file.
#'
#' Only the first two lines are read. Line 1 is matched as '<sample>@<size>K'
#' and gives the query; line 2 is matched as '<sample>@... <decimal number>'
#' and gives the closest reference and its distance.
#'
#' @param file_path Path to a skmer result file.
#' @return A one-row tibble with `sample_id`, `query_bp` (size label, character),
#'   `closest_reference_sample_id` and `closest_distance` (numeric). Fields
#'   whose pattern does not match are `NA`.
extract_skmer_results <- function(file_path) {
  first_two <- readLines(file_path, n = 2)

  # Capture groups 1 and 2 of each pattern (columns 2 and 3 of the match matrix)
  query_fields <- str_match(first_two[1], "\\s*(.*?)@(\\d+K)")[, 2:3]
  hit_fields <- str_match(first_two[2], "\\s*(.*?)@.*\\s+(\\d+\\.\\d+)")[, 2:3]

  tibble(
    sample_id = query_fields[1],
    query_bp = query_fields[2],
    closest_reference_sample_id = hit_fields[1],
    closest_distance = as.numeric(hit_fields[2])
  )
}
Now we will apply this function to all skmer output files.
# Parse every skmer result file in parallel, attach the labels of the query and
# of its closest reference, and score the match at each taxonomic rank.
plan(multisession, workers = 12)
skmer_results_df <- list.files('Malpighiaceae+Chrysobalanaceae/skmer/skmer_xval_results/',
                               full.names = TRUE) %>%
  furrr::future_map_dfr(~ extract_skmer_results(.x)) %>%
  left_join(samp_labels, by = 'sample_id') %>%
  # The labels of the closest reference play the role of the prediction.
  left_join(
    samp_labels %>% select(
      closest_reference_sample_id = 'sample_id',
      predicted_labels = actual_labels
    ),
    by = 'closest_reference_sample_id'
  ) %>%
  mutate(
    # Split first and drop the quality flag afterwards. Removing the flag
    # together with its surrounding ';' from the raw string would glue two
    # labels together whenever the flag sits between them.
    query_labels = str_split(actual_labels, ';') %>%
      lapply(function(labels) labels[labels != 'low_quality:True']),
    predicted_list = str_split(predicted_labels, ';')
  ) %>%
  rowwise() %>%
  mutate(
    # *_correct: the true label of that rank is among the predicted labels.
    family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
    genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
    species_correct = ifelse(any(str_detect(query_labels, 'species')),
                             query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
                             NA),
    # *_incorrect: at least one predicted label of that rank is not a true label.
    family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in%
                               query_labels[str_detect(query_labels, 'family')])),
    genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in%
                              query_labels[str_detect(query_labels, 'genus')])),
    species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                               any(!(predicted_list[str_detect(predicted_list, 'species')] %in%
                                       query_labels[str_detect(query_labels, 'species')])),
                               NA)
  )
plan(sequential)
skmer_results_df
Now let’s summarize and plot by genus:
# Genus-level outcome of the Skmer closest-reference identification.
skmer_summary_genus <- summarize_results(skmer_results_df, level = 'genus')
p_skmer_genus <- plot_area(skmer_summary_genus, title = 'Skmer genus', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_genus
Now by species. In Skmer, there is no inconclusive result: if there is no correct species prediction, it means that a sample was predicted in the wrong genus and therefore it is incorrect
# Species level: every outcome other than 'correct' is folded into 'incorrect',
# and the counts and fractions of the folded categories are added up.
skmer_summary_species <- skmer_results_df %>%
  summarize_results('species') %>%
  mutate(result = ifelse(result == 'correct', 'correct', 'incorrect')) %>%
  group_by(query_bp, result) %>%
  summarise(across(everything(), sum))
p_skmer_species <- plot_area(skmer_summary_species, title = 'Skmer species', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_species
And now by family:
# Family-level outcome of the Skmer closest-reference identification.
skmer_summary_family <- summarize_results(skmer_results_df, level = 'family')
skmer_summary_family
p_skmer_family <- plot_area(skmer_summary_family, title = 'Skmer family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_family
Let’s now read the traditional barcode BLAST results and summarize them in the same way as skmer and varKoder. Let’s start by defining a function that reads the data so we can summarize it using the previously defined functions.
#' Read the per-marker BLAST results for one query size and score them.
#'
#' Reads '<bp>M_blast_phylo_sum_sp.tsv', reshapes it to one row per sample and
#' marker, looks up the labels of the query and of its best hit in the global
#' `samp_labels`, and adds `<rank>_correct` / `<rank>_incorrect` flags for
#' family, genus and species. The 'Concatenated_phylogeny' column is dropped.
#'
#' @param bp Query size as it appears at the start of the file name
#'   (e.g. 0.5, 10, 200).
#' @return A row-wise tibble with one row per sample and marker. `query_bp` is
#'   `bp * 1e3`. Where no best hit is available, all non-missing flags are FALSE.
read_traditional_barcodes <- function(bp) {
  input_file <- paste0(
    'Malpighiaceae+Chrysobalanaceae/traditional_barcodes/2_blast_phylogeny_result/Genus/',
    bp,
    'M_blast_phylo_sum_sp.tsv'
  )

  # Read every column as text: all of them hold sample identifiers, and type
  # guessing otherwise reads 'Concatenated_phylogeny' as logical in some files.
  read_delim(input_file,
             delim = '\t',
             col_types = cols(.default = col_character())) %>%
    pivot_longer(-sp, names_to = 'marker', values_to = 'closest_reference_sample_id') %>%
    rename(sample_id = 'sp') %>%
    mutate(
      # Keep only the part before '@' as the sample identifier.
      sample_id = str_remove_all(sample_id, '@.+'),
      closest_reference_sample_id = str_remove_all(closest_reference_sample_id, '@.+'),
      predicted_labels = samp_labels$actual_labels[match(closest_reference_sample_id, samp_labels$sample_id)],
      actual_labels = samp_labels$actual_labels[match(sample_id, samp_labels$sample_id)]
    ) %>%
    filter(marker != 'Concatenated_phylogeny') %>%
    mutate(
      # Split first and drop the quality flag afterwards. Removing the flag
      # together with its surrounding ';' from the raw string would glue two
      # labels together whenever the flag sits between them.
      query_labels = str_split(actual_labels, ';') %>%
        lapply(function(labels) labels[labels != 'low_quality:True']),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    rowwise() %>%
    mutate(
      # *_correct: the true label of that rank is among the predicted labels.
      family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
      genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
      species_correct = ifelse(any(str_detect(query_labels, 'species')),
                               query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
                               NA),
      # *_incorrect: at least one predicted label of that rank is not a true label.
      family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in%
                                 query_labels[str_detect(query_labels, 'family')])),
      genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in%
                                query_labels[str_detect(query_labels, 'genus')])),
      species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                                 any(!(predicted_list[str_detect(predicted_list, 'species')] %in%
                                         query_labels[str_detect(query_labels, 'species')])),
                                 NA)
    ) %>%
    # Without a best hit there is no prediction: force every non-missing flag
    # to FALSE so the row is neither correct nor incorrect.
    mutate(across(c(ends_with("_correct"), ends_with("_incorrect")),
                  ~ ifelse(is.na(predicted_labels) & !is.na(.x), FALSE, .x))) %>%
    mutate(query_bp = bp * 1e3)
}
Now we can apply this function to all of our results:
results_barcodes = purrr::map_dfr(c(0.5,1,2,5,10,20,50,100,200),read_traditional_barcodes)
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 285 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 267 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 200 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 166 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
results_barcodes
Now let’s summarise for each marker separately:
# Summarize each marker on its own at the three ranks; the marker name is kept
# in the `marker` column of the stacked result.
barcode_summary_family <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(function(marker_df) summarize_results(marker_df, 'family'), .id = 'marker')
barcode_summary_family

barcode_summary_genus <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(function(marker_df) summarize_results(marker_df, 'genus'), .id = 'marker')
barcode_summary_genus

barcode_summary_species <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(function(marker_df) summarize_results(marker_df, 'species'), .id = 'marker')
barcode_summary_species
Now let’s plot, making separate plots for each marker:
Species:
# One species-level area plot per marker, titled '<marker> species'.
p_barcode_species <- barcode_summary_species %>%
  split(.$marker) %>%
  purrr::map(function(marker_summary) {
    plot_area(marker_summary,
              paste0(unique(marker_summary$marker), ' species'),
              relative = TRUE,
              xlim_all = TRUE)
  })
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_species
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Genera:
# One genus-level area plot per marker, titled '<marker> genus'.
p_barcode_genus <- barcode_summary_genus %>%
  split(.$marker) %>%
  purrr::map(function(marker_summary) {
    plot_area(marker_summary,
              paste0(unique(marker_summary$marker), ' genus'),
              relative = TRUE,
              xlim_all = TRUE)
  })
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_genus
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Family:
# One family-level area plot per marker, titled '<marker> family'.
p_barcode_family <- barcode_summary_family %>%
  split(.$marker) %>%
  purrr::map(function(marker_summary) {
    plot_area(marker_summary,
              paste0(unique(marker_summary$marker), ' family'),
              relative = TRUE,
              xlim_all = TRUE)
  })
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_family
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Now we will do the same for concatenated tree. Let’s start by defining a function to gather results. We will consider a result as correct if the majority of the sister taxon to a tip has the same label.
# Score concatenated-barcode identifications for one data amount.
#
# Reads the concatenated-barcode tree for `bp`, finds for every query tip the
# reference tip with the smallest patristic distance (never the query's own
# reference), and compares the labels of that reference with the labels of the
# query at family, genus and species level.
#
# Args:
#   bp: data amount used to build the file names ('conc.<bp>m.spname.tre' and
#       '<bp>M_blast_phylo_sum_sp.tsv'); presumably megabase pairs -- confirm.
#
# Returns:
#   A rowwise tibble with one row per sample listed in the BLAST summary
#   table, with the closest reference, predicted/actual labels, the
#   *_correct / *_incorrect flags and `query_bp` (= bp * 1e3).
#
# Relies on the global `samp_labels` (columns `sample_id`, `actual_labels`).
read_concatenated_tree_results <- function(bp) {
  tree <- read.tree(paste0('Malpighiaceae+Chrysobalanaceae/traditional_barcodes/2_blast_phylogeny_result/Genus/conc.',bp,'m.spname.tre'))

  # Leave only sample IDs as tip labels; references end in '_ref'.
  tree$tip.label <- tree$tip.label %>%
    str_remove(".*@") %>%
    str_remove("'") %>%
    str_replace(' ref', '_ref')

  # Patristic distances between all tips, then split tips into references
  # and queries.
  patristic_distances <- cophenetic(tree)
  tip_names <- dimnames(patristic_distances)[[1]]
  is_ref <- str_detect(tip_names, '_ref$')
  all_ref_names <- tip_names[is_ref]
  all_nonref <- tip_names[!is_ref]

  # For one query tip, return the sample ID of the closest reference.
  find_closest <- function(tip) {
    # Exclude only the query's own reference. This must be an exact
    # comparison: a regex/substring match on paste0(tip, '_ref') would also
    # drop unrelated references such as '2095_ref' when tip is '95'.
    candidates <- all_ref_names[all_ref_names != paste0(tip, '_ref')]
    if (length(candidates) == 0) {
      return(NA_character_)
    }
    nearest <- which.min(patristic_distances[tip, candidates])
    if (length(nearest) == 0) {
      # All distances missing.
      return(NA_character_)
    }
    # Strip the reference marker only where it is a suffix.
    str_remove(candidates[nearest], '_ref$')
  }
  closest_match <- purrr::map_chr(all_nonref, find_closest)

  samples_with_data <- read_delim(paste0('Malpighiaceae+Chrysobalanaceae/traditional_barcodes/2_blast_phylogeny_result/Genus/',bp,'M_blast_phylo_sum_sp.tsv')) %>%
    select(sample_id = sp) %>%
    mutate(sample_id = str_remove_all(sample_id, '@.+'))

  tibble(
    sample_id = all_nonref,
    closest_reference_sample_id = closest_match
  ) %>%
    # Keep every sample from the summary table, even those absent from the tree.
    right_join(samples_with_data, by = 'sample_id') %>%
    mutate(
      predicted_labels = samp_labels$actual_labels[match(closest_reference_sample_id, samp_labels$sample_id)],
      actual_labels = samp_labels$actual_labels[match(sample_id, samp_labels$sample_id)]
    ) %>%
    # NOTE(review): sample '2095' is excluded by hand; the reason is not
    # visible in this function -- confirm.
    filter(sample_id != '2095') %>%
    mutate(
      query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    rowwise() %>%
    mutate(
      family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
      genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
      # Species flags are NA for queries without a species label.
      species_correct = ifelse(
        any(str_detect(query_labels, 'species')),
        query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
        NA
      ),
      family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in% query_labels[str_detect(query_labels, 'family')])),
      genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in% query_labels[str_detect(query_labels, 'genus')])),
      species_incorrect = ifelse(
        any(str_detect(query_labels, 'species')),
        any(!(predicted_list[str_detect(predicted_list, 'species')] %in% query_labels[str_detect(query_labels, 'species')])),
        NA
      )
    ) %>%
    # A sample without any prediction is neither correct nor incorrect;
    # flags that are already NA stay NA.
    mutate(across(
      c(ends_with("_correct"), ends_with("_incorrect")),
      ~ ifelse(is.na(predicted_labels) & !is.na(.x), FALSE, .x)
    )) %>%
    mutate(query_bp = bp * 1e3)
}
Now let’s apply this function
# Run the concatenated-tree scoring for every data amount and stack the
# per-amount tables into one.
results_concat_barcodes <- dplyr::bind_rows(
  purrr::map(
    c(0.5, 1, 2, 5, 10, 20, 50, 100, 200),
    read_concatenated_tree_results
  )
)
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 285 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 267 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 200 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 166 Columns: 7── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`
results_concat_barcodes
Let’s summarize results and plot for genus, species and family accuracy
# Species-level summary of the concatenated-barcode results and its area plot.
concat_summary_species <- results_concat_barcodes %>%
  summarize_results('species')
p_concat_species <- plot_area(
  concat_summary_species,
  title = 'Concatenated barcodes species',
  relative = FALSE,
  xlim_all = TRUE
)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_species
# Genus-level summary of the concatenated-barcode results and its area plot.
concat_summary_genus <- results_concat_barcodes %>%
  summarize_results('genus')
p_concat_genus <- plot_area(
  concat_summary_genus,
  title = 'Concatenated barcodes genus',
  relative = TRUE,
  xlim_all = TRUE
)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_genus
# Family-level summary of the concatenated-barcode results and its area plot.
concat_summary_family <- results_concat_barcodes %>%
  summarize_results('family')
p_concat_family <- plot_area(
  concat_summary_family,
  title = 'Concatenated barcodes family',
  relative = TRUE,
  xlim_all = TRUE
)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_family
Now let’s compare methods side by side. For genus level:
# Genus-level comparison: varKoder, Skmer, ITS, rbcL and concatenated
# barcodes stacked in a single column. Only the bottom panel keeps its
# x-axis text and title.
theme_hide_x_axis <- theme(
  axis.text.x = element_blank(),
  axis.title.x = element_blank()
)
p <- patchwork::wrap_plots(
  p_genus + theme_hide_x_axis,
  p_skmer_genus + theme_hide_x_axis,
  p_barcode_genus$ITS + theme_hide_x_axis,
  p_barcode_genus$rbcL + theme_hide_x_axis,
  p_concat_genus,
  ncol = 1
) +
  plot_annotation(title = 'Genus-level accuracy')
p
# Pass the plot explicitly instead of relying on last_plot(), so the saved
# figure does not depend on which plot happened to be displayed last.
ggsave('images_manuscript/fig3_genus_accuracy.pdf', plot = p, width = 5, height = 10)
ggsave('images_manuscript/fig3_genus_accuracy.png', plot = p, width = 5, height = 10, dpi = 1200)
Now for species level:
# Species-level comparison: varKoder, Skmer, ITS, rbcL and concatenated
# barcodes stacked in a single column. Only the bottom panel keeps its
# x-axis text and title.
theme_hide_x_axis <- theme(
  axis.text.x = element_blank(),
  axis.title.x = element_blank()
)
p <- patchwork::wrap_plots(
  p_species + theme_hide_x_axis,
  p_skmer_species + theme_hide_x_axis,
  p_barcode_species$ITS + theme_hide_x_axis,
  p_barcode_species$rbcL + theme_hide_x_axis,
  p_concat_species,
  ncol = 1
) +
  plot_annotation(title = 'species-level accuracy')
p
# Pass the plot explicitly instead of relying on last_plot(), so the saved
# figure does not depend on which plot happened to be displayed last.
ggsave('images_manuscript/fig3_species_accuracy.pdf', plot = p, width = 5, height = 10)
ggsave('images_manuscript/fig3_species_accuracy.png', plot = p, width = 5, height = 10, dpi = 1200)
Now for family level:
# Family-level comparison: one titled panel per method, stacked in a single
# column. Every panel except ITS drops its legend, and the legends are
# collected by patchwork (guides = 'collect'). Only the bottom panel keeps
# its x-axis text and title.
theme_hide_x_axis <- theme(
  axis.text.x = element_blank(),
  axis.title.x = element_blank()
)
theme_hide_legend <- theme(legend.position = 'none')
p <- patchwork::wrap_plots(
  p_family + ggtitle('varKoder') + theme_hide_x_axis + theme_hide_legend,
  p_skmer_family + ggtitle('Skmer') + theme_hide_x_axis + theme_hide_legend,
  p_barcode_family$ITS + ggtitle('ITS') + theme_hide_x_axis,
  p_barcode_family$rbcL + ggtitle('rbcL') + theme_hide_x_axis + theme_hide_legend,
  p_concat_family + ggtitle('Concatenated conventional barcodes') + theme_hide_legend,
  ncol = 1,
  guides = 'collect'
) +
  plot_annotation(
    title = 'Family-level accuracy',
    theme = theme(plot.title = element_text(hjust = 0.5))
  )
p
# Pass the plot explicitly instead of relying on last_plot(), so the saved
# figure does not depend on which plot happened to be displayed last.
ggsave('images_manuscript/fig3_family_accuracy.pdf', plot = p, width = 5, height = 10)
ggsave('images_manuscript/fig3_family_accuracy.png', plot = p, width = 5, height = 10, dpi = 1200)
Now let’s plot a figure for the other traditional barcode loci that did not make this list.
# Supplementary figure: matK, ndhF and trnL-F accuracy at species, genus and
# family level. Each taxonomic level is one column (p1, p2, p3) made of a thin
# header panel carrying the column title plus three marker panels.

# Theme fragments shared by the panels.
supp_blank <- element_blank()
supp_hide_x_text <- theme(axis.text.x = supp_blank)
supp_hide_y_text <- theme(axis.text.y = supp_blank)
supp_hide_axis_titles <- theme(axis.title.x = supp_blank, axis.title.y = supp_blank)
supp_hide_legend <- theme(legend.position = 'none')
supp_row_heights <- c(1, 15, 15, 15)

# Empty plot used only to display a centred column title.
supp_column_header <- function(label) {
  ggplot() +
    theme_minimal() +
    ggtitle(label) +
    theme(
      plot.title = element_text(hjust = 0.5),
      plot.margin = unit(c(0, 0, 0, 0), 'lines')
    )
}

# Species column: keeps y-axis text; only the bottom panel keeps x-axis text.
p1 <- patchwork::wrap_plots(
  supp_column_header("Species"),
  p_barcode_species$matK + ggtitle('matK') +
    supp_hide_x_text + supp_hide_axis_titles + supp_hide_legend,
  p_barcode_species$ndhF + ggtitle('ndhF') +
    supp_hide_x_text + supp_hide_axis_titles + supp_hide_legend,
  p_barcode_species$`trnL-F` + ggtitle('trnL-F') +
    supp_hide_axis_titles + supp_hide_legend,
  ncol = 1,
  heights = supp_row_heights
)

# Genus column: additionally drops y-axis text.
p2 <- patchwork::wrap_plots(
  supp_column_header("Genus"),
  p_barcode_genus$matK + ggtitle('matK') +
    supp_hide_x_text + supp_hide_axis_titles + supp_hide_y_text + supp_hide_legend,
  p_barcode_genus$ndhF + ggtitle('ndhF') +
    supp_hide_x_text + supp_hide_axis_titles + supp_hide_y_text + supp_hide_legend,
  p_barcode_genus$`trnL-F` + ggtitle('trnL-F') +
    supp_hide_axis_titles + supp_hide_y_text + supp_hide_legend,
  ncol = 1,
  heights = supp_row_heights
)

# Family column: the bottom panel keeps its legend, which is then collected.
p3 <- patchwork::wrap_plots(
  supp_column_header("Family"),
  p_barcode_family$matK + ggtitle('matK') +
    supp_hide_x_text + supp_hide_axis_titles + supp_hide_y_text + supp_hide_legend,
  p_barcode_family$ndhF + ggtitle('ndhF') +
    supp_hide_x_text + supp_hide_axis_titles + supp_hide_y_text + supp_hide_legend,
  p_barcode_family$`trnL-F` + ggtitle('trnL-F') +
    supp_hide_axis_titles + supp_hide_y_text,
  ncol = 1,
  heights = supp_row_heights,
  guides = 'collect'
)

# Put the three columns side by side under a common title and save.
p <- patchwork::wrap_plots(p1, p2, p3, ncol = 3, guides = "collect") +
  plot_annotation(
    title = 'Conventional barcode accuracy across different taxonomic levels',
    theme = theme(plot.title = element_text(hjust = 0.2, face = 'bold', size = 15))
  )
ggsave('images_manuscript/supp_traditional_barcodes.pdf', plot = p, width = 8, height = 6)
Finally, let’s summarize results for the whole SRA dataset. In this case, we only have varKoder results, since Skmer cannot finish and traditional barcodes are inapplicable. We made predictions separately for families included in the training set and families not included in the training set, so we will load each one and concatenate them.
# Load varKoder predictions for SRA queries whose family is in the training set.
varKoder_SRA_results <- read_csv('all_SRA/varkoder_query_results/predictions.csv')
varKoder_SRA_results <- varKoder_SRA_results %>%
  # Drop the first column of the file.
  select(-1) %>%
  # We will ignore queries that are not standardized sizes.
  filter(str_detect(query_basepairs, '^0+[125]0+K$')) %>%
  rename(query_bp = query_basepairs) %>%
  mutate(quality_included = TRUE)
New names:Rows: 8607 Columns: 873── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): varKode_image_path, sample_id, query_basepairs, query_kmer_len, trained_model_path, prediction_type...
dbl (865): ...1, prediction_threshold, actual_labels, basefrequency_sd, 10066, 101087, 10135, 10167, 10193, 10...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plan(sequential)
# Score each query: correct when the actual family label is among the
# predicted labels; incorrect when a prediction was made and at least one
# predicted label is not the actual one.
varKoder_SRA_results <- varKoder_SRA_results %>%
  mutate(
    # unlist() requires exactly one label per row after removing the
    # low-quality flag.
    query_labels = unlist(str_split(str_remove(actual_labels, ";*low_quality:True;*"), ';')),
    predicted_list = str_split(predicted_labels, ';')
  ) %>%
  rowwise() %>%
  mutate(
    family_correct = query_labels %in% predicted_list,
    family_incorrect = !is.na(predicted_labels) && !all(predicted_list %in% query_labels),
    family_in_training = TRUE
  ) %>%
  # Keep only columns whose name does not start with a digit.
  select(matches("^[^0-9]"))
varKoder_SRA_results
NA
# Load varKoder predictions for SRA queries whose family is NOT in the training set.
varKoder_SRA_results_notincluded <- read_csv('all_SRA/varkoder_query_notincluded_results/predictions.csv')
varKoder_SRA_results_notincluded <- varKoder_SRA_results_notincluded %>%
  # Drop the first column of the file.
  select(-1) %>%
  # We will ignore queries that are not standardized sizes.
  filter(str_detect(query_basepairs, '^0+[125]0+K$')) %>%
  rename(query_bp = query_basepairs) %>%
  mutate(quality_included = TRUE)
New names:Rows: 8439 Columns: 873── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): varKode_image_path, sample_id, query_basepairs, query_kmer_len, trained_model_path, prediction_type...
dbl (865): ...1, prediction_threshold, actual_labels, basefrequency_sd, 10066, 101087, 10135, 10167, 10193, 10...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plan(sequential)
# Unique taxon labels present among the queries whose family was not trained on.
SRA_taxlabels_notincluded <- unique(unlist(str_split(
  str_remove(varKoder_SRA_results_notincluded$actual_labels, ";*low_quality:True;*"),
  ';'
)))
# Score each query with the same rules used for the in-training families.
varKoder_SRA_results_notincluded <- varKoder_SRA_results_notincluded %>%
  mutate(
    # unlist() requires exactly one label per row after removing the
    # low-quality flag.
    query_labels = unlist(str_split(str_remove(actual_labels, ";*low_quality:True;*"), ';')),
    predicted_list = str_split(predicted_labels, ';')
  ) %>%
  rowwise() %>%
  mutate(
    family_correct = query_labels %in% predicted_list,
    family_incorrect = !is.na(predicted_labels) && !all(predicted_list %in% query_labels),
    family_in_training = FALSE
  ) %>%
  # Keep only columns whose name does not start with a digit.
  select(matches("^[^0-9]"))
varKoder_SRA_results_notincluded
Now let’s summarize and plot:
# Family-level summaries for both query sets, tagged by whether the family
# was part of the training set.
SRA_summary_family <- bind_rows(
  varKoder_SRA_results %>%
    summarize_results('family') %>%
    mutate(family_in_training = TRUE),
  varKoder_SRA_results_notincluded %>%
    summarize_results('family') %>%
    mutate(family_in_training = FALSE)
)
SRA_summary_family
# Total number of samples per data amount and training status.
N_samp <- summarise(
  group_by(SRA_summary_family, query_bp, family_in_training),
  N = sum(N)
)
`summarise()` has grouped output by 'query_bp'. You can override using the `.groups` argument.
# Area plot of SRA family-level accuracy, wrapped by training-set membership.
p_SRA_family <- plot_area(
  SRA_summary_family,
  'varKoder SRA family',
  wrap = '~family_in_training',
  relative = TRUE,
  xlim_all = FALSE
)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_SRA_family
Let’s now do the SRA plot, but splitting by kingdom and by whether or not the family was included in the training set. First, we need to retrieve kingdom information:
# Summarise family-level accuracy per kingdom for one prediction table.
#
# Args:
#   metadata_csv: path to the run metadata CSV; its `Run` and `Kingdom`
#                 columns are used.
#   predictions:  scored prediction table with a `sample_id` column.
#   in_training:  logical stored in the `family_in_training` column.
#
# Returns:
#   The per-kingdom output of summarize_results(level = 'family'), with
#   `Kingdom` as an ordered factor and a `family_in_training` column.
kingdom_family_summary <- function(metadata_csv, predictions, in_training) {
  with_kingdom <- read_csv(metadata_csv) %>%
    select(sample_id = Run, Kingdom) %>%
    # Keep every prediction row, even if the run is missing from the metadata.
    right_join(predictions, by = 'sample_id')
  with_kingdom %>%
    split(with_kingdom$Kingdom) %>%
    purrr::map_df(summarize_results, level = 'family', .id = 'Kingdom') %>%
    mutate(
      Kingdom = factor(
        Kingdom,
        levels = c('Metazoa', 'Viridiplantae', 'Fungi'),
        ordered = TRUE
      ),
      family_in_training = in_training
    )
}

# Both query sets stacked, with a human-readable facet label.
summary_SRA_by_kingdom <- bind_rows(
  kingdom_family_summary(
    'all_SRA/runs_to_download_data.csv',
    varKoder_SRA_results,
    TRUE
  ),
  kingdom_family_summary(
    'all_SRA/runs_notincluded_to_download_data.csv',
    varKoder_SRA_results_notincluded,
    FALSE
  )
) %>%
  # FALSE + 1 picks the first label, TRUE + 1 the second.
  mutate(family_in_training = c('Family\nnot in training set', 'Family\nin training set')[family_in_training + 1])
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 8264 Columns: 51── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (28): Run, AssemblyName, download_path, Experiment, LibraryName, LibraryStrategy, LibrarySelection, Libra...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, InsertDev, Study_Pubmed_id, Project...
lgl (10): g1k_pop_code, source, g1k_analysis_group, Subject_ID, Disease, Affection_Status, Analyte_Type, Hist...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 1697 Columns: 51── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (27): Run, download_path, Experiment, LibraryName, LibraryStrategy, LibrarySelection, LibrarySource, Libr...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, InsertDev, Study_Pubmed_id, Project...
lgl (11): AssemblyName, g1k_pop_code, source, g1k_analysis_group, Subject_ID, Disease, Affection_Status, Anal...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`
summary_SRA_by_kingdom
# Absolute-count area plot, faceted by training status (rows) and kingdom
# (columns), with the x-axis restricted to the given range.
p_SRA_families <- plot_area(
  summary_SRA_by_kingdom,
  title = 'Eukaryote families',
  relative = FALSE,
  xlim_all = FALSE
)
p_SRA_families <- p_SRA_families +
  facet_grid(family_in_training ~ Kingdom) +
  coord_cartesian(xlim = c(500, 10000) * 1000, expand = FALSE) +
  theme(text = element_text(size = 10))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Coordinate system already present. Adding new coordinate system, which will replace the existing one.
print(p_SRA_families)
# Save the plot by name rather than relying on last_plot().
ggsave('images_manuscript/fig3_SRA_accuracy.pdf', plot = p_SRA_families, width = 5, height = 4)
ggsave('images_manuscript/fig3_SRA_accuracy.png', plot = p_SRA_families, width = 5, height = 4, dpi = 1200)
Now we will make a small figure to include the additional datasets in which we applied varKoding.
In these cases, we chose a test set that included both taxa in the
training set and taxa not in the training set, so we will graph both
separately. This is denoted by a column named
in_training_model. Let’s start by reading results.
Let’s define a function to read and process predictions:
# Read one varKoder prediction table and score every query.
#
# Args:
#   infile: path to a prediction CSV with at least the columns `sample_id`,
#           `query_basepairs`, `actual_labels` and `predicted_labels`.
#
# Returns:
#   A rowwise tibble with `query_basepairs` renamed to `query_bp`, the first
#   column of the file dropped, and two logical columns added:
#   `taxon_correct` (any actual label was predicted) and `taxon_incorrect`
#   (any non-missing predicted label is not an actual label).
read_and_process_others <- function(infile) {
  read_csv(infile) %>%
    # IDs are forced to character so tables from different datasets can be
    # row-bound even when some files have numeric-looking IDs.
    mutate(sample_id = as.character(sample_id)) %>%
    select(-1) %>%
    rename(query_bp = query_basepairs) %>%
    mutate(
      query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    rowwise() %>%
    mutate(
      taxon_correct = any(query_labels %in% predicted_list),
      taxon_incorrect = any(!(predicted_list[!is.na(predicted_list)] %in% query_labels))
    )
}
Now let’s apply this function to all files.
# Locate the prediction tables, name each one after the part of the file name
# that precedes '_prediction_table.csv', then read and stack them with the
# dataset name in a `dataset` column.
prediction_files <- list.files(
  'other_datasets',
  pattern = 'prediction.+csv',
  full.names = TRUE
)
names(prediction_files) <- str_extract(
  basename(prediction_files),
  ".*(?=_prediction_table\\.csv)"
)
other_results <- purrr::map_dfr(
  prediction_files,
  read_and_process_others,
  .id = 'dataset'
)
Rows: 18 Columns: 16── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_training_model, predicted_labels, act...
dbl (8): Bembidion, a, basefrequency_sd, ampliatum, breve, lividulum, saturatum, testatum
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 18 Columns: 16── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_training_model, predicted_labels, act...
dbl (8): Corallorhiza, prediction_threshold, basefrequency_sd, Corallorhiza bentleyi, Corallorhiza striata, Co...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 25 Columns: 16── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_training_model, predicted_labels, act...
dbl (8): Mycobacterium tuberculosis, prediction_threshold, basefrequency_sd, 1.2.2.1, 2.2.1.1.1, 3.1.2, 4.1.i1...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 15 Columns: 16── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_training_model, predicted_labels, act...
dbl (8): Xanthoparmelia, prediction_threshold, basefrequency_sd, camtschadalis, chlorochroa, coloradoensis, me...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
other_results
Let’s now summarize by dataset and separately for taxa included and excluded from the training set.
# Summarise each dataset x training-status combination separately. split()
# names the pieces '<dataset>.<in_training_model>', which is taken apart again
# after summarising.
summary_others <- split(
  other_results,
  interaction(other_results$dataset, other_results$in_training_model)
) %>%
  purrr::map_dfr(summarize_results, level = 'taxon', .id = 'comb') %>%
  separate(comb, into = c("dataset", "taxon_in_training_raw"), sep = "\\.") %>%
  mutate(taxon_in_training = taxon_in_training_raw == 'yes') %>%
  select(-taxon_in_training_raw) %>%
  mutate(
    # FALSE + 1 picks the first label, TRUE + 1 the second.
    taxon_in_training = c('Taxon not in training set', 'Taxon in training set')[taxon_in_training + 1],
    # Capitalise the first letter of the dataset name.
    dataset = str_replace(dataset, "^(.)", ~toupper(.x))
  ) %>%
  mutate(
    result = factor(
      result,
      levels = c("correct", "ambiguous", "inconclusive", "incorrect"),
      ordered = TRUE
    )
  )
summary_others
NA
Now let’s plot
# Stacked bar chart of outcome counts per dataset, with one facet row per
# training status. The original scale_alpha_manual() call was dropped: no
# alpha aesthetic is mapped, so it had no effect on the plot.
p_others <- ggplot(summary_others, aes(x = dataset, y = N, fill = result)) +
  geom_col() +
  scale_fill_manual(
    values = setNames(
      RColorBrewer::brewer.pal(4, "Accent"),
      c("correct", "ambiguous", "inconclusive", "incorrect")
    )
  ) +
  ggtitle('Other datasets') +
  ylab('Number of samples') +
  xlab('Taxon') +
  theme_few() +
  scale_y_continuous(minor_breaks = waiver()) +
  theme(
    panel.background = element_rect(fill = NA),
    panel.grid.major.y = element_line(colour = gray(0.5)),
    panel.grid.minor.y = element_line(colour = gray(0.6), linetype = 2),
    axis.title.x = element_blank(),
    axis.text.x = element_text(face = 'italic'),
    # Draw the panel (grid lines) on top of the bars.
    panel.ontop = TRUE
  ) +
  coord_cartesian(expand = FALSE) +
  facet_grid(taxon_in_training ~ .) +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 30, hjust = 1)
  )
p_others
# Save the plot by name rather than relying on last_plot().
ggsave('images_manuscript/fig3_others_accuracy.pdf', plot = p_others, width = 5, height = 3)
ggsave('images_manuscript/fig3_others_accuracy.png', plot = p_others, width = 5, height = 3, dpi = 1200)
Here we just query our results to get a few figures that we report in the paper.
Total number of samples used in cross-validation:
dim(samp_labels)
[1] 287 2
Number of Stigmaphyllon samples with each kind of error for varkoder:
summary_species
Number of Stigmaphyllon samples with each kind of error for skmer:
skmer_summary_species
Traditional barcode accuracy for species:
barcode_summary_species %>% arrange(query_bp,marker)
Concatenated barcode accuracy for species:
concat_summary_species
varKoder accuracy for genera:
summary_genus
varKoder accuracy for family:
summary_family
Skmer accuracy for genera:
skmer_summary_genus
Skmer accuracy for family:
skmer_summary_family
Number of samples available for each genus and data amount
# Count samples for each genus and data amount. The genus must be part of the
# grouping (it was previously extracted but never used), and complete() needs
# the columns to expand so that missing combinations appear with N = 0.
results %>%
  mutate(genus = str_extract(actual_labels, "(?<=genus:)[^;]+")) %>%
  group_by(genus, query_bp) %>%
  summarize(N = n(), .groups = 'drop') %>%
  complete(genus, query_bp, fill = list(N = 0))
Plot number of samples for supplementary material.
# Samples per genus and data amount; combinations with no samples get N = 0
# and genera are ordered by their counts for plotting.
n_samples_genera <- ungroup(
  summarise(
    group_by(
      mutate(results, taxon = str_extract(actual_labels, "(?<=genus:)[^;]+")),
      taxon, query_bp
    ),
    N = n()
  )
)
n_samples_genera <- n_samples_genera %>%
  complete(taxon, query_bp, fill = list(N = 0)) %>%
  mutate(taxon = fct_reorder(taxon, N))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_genera
# Samples per species and data amount, using only samples that carry a
# species label; combinations with no samples get N = 0 and species are
# ordered by their counts for plotting.
n_samples_species <- ungroup(
  summarise(
    group_by(
      filter(
        mutate(results, taxon = str_extract(actual_labels, "(?<=species:)[^;]+")),
        !is.na(taxon)
      ),
      taxon, query_bp
    ),
    N = n()
  )
)
n_samples_species <- n_samples_species %>%
  complete(taxon, query_bp, fill = list(N = 0)) %>%
  mutate(taxon = fct_reorder(taxon, N))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_species
For SRA, we have to count both validation and training samples, since we did not do cross-validation. Let’s use image names to get the information and then the results table to figure out which ones were in the validation set.
# List every varKode image (training and query). `pattern` is a regular
# expression, not a glob, so PNG files are matched with '\\.png$'.
all_files <- c(
  list.files('all_SRA/varkoder_images_SRA/', pattern = '\\.png$', recursive = TRUE),
  list.files('all_SRA/varkoder_query_images/', pattern = '\\.png$', recursive = TRUE)
)
# Count images per family, data amount and validation status.
n_samples_SRA <- data.frame(filename = all_files) %>%
  mutate(
    # Everything before the "@" symbol, excluding the symbol itself.
    sample_id = str_extract(filename, "^(.+)(?=@)"),
    # Digits following "@" together with the "K" suffix, kept as a string.
    query_bp = str_extract(filename, "(?<=@)([0-9]+)K")
  ) %>%
  left_join(
    read_csv('all_SRA/runs_to_download_data.csv') %>%
      select(sample_id = Run, Kingdom, taxon = FamilyID),
    by = 'sample_id'
  ) %>%
  mutate(taxon = as.character(taxon)) %>%
  # A sample belongs to the validation set when it appears in the query results.
  mutate(validation_set = sample_id %in% varKoder_SRA_results$sample_id) %>%
  group_by(taxon, query_bp, validation_set) %>%
  summarize(N = n(), .groups = 'drop') %>%
  mutate(taxon = fct_reorder(taxon, N))
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 8264 Columns: 51── Column specification ──────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (28): Run, AssemblyName, download_path, Experiment, LibraryName, LibraryStrategy, LibrarySelection, Libra...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, InsertDev, Study_Pubmed_id, Project...
lgl (10): g1k_pop_code, source, g1k_analysis_group, Subject_ID, Disease, Affection_Status, Analyte_Type, Hist...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)``summarise()` has grouped output by 'taxon', 'query_bp'. You can override using the `.groups` argument.
n_samples_SRA
# How many image files under varkoder_images_SRA/ belong to samples that
# appear in the query results. `pattern` is a regular expression, not a glob,
# so PNG files are matched with '\\.png$'.
summary(
  str_extract(
    list.files('all_SRA/varkoder_images_SRA/', pattern = '\\.png$', recursive = TRUE),
    "^(.+)(?=@)"
  ) %in% varKoder_SRA_results$sample_id
)
Mode FALSE TRUE
logical 32496 8607
# Stacked area plot of the number of samples per taxon across data amounts.
#
# Args:
#   df:           data frame with columns `taxon`, `query_bp` (a string from
#                 which a number is parsed and multiplied by 1000) and `N`.
#   title:        plot title (string or expression).
#   reference_bp: `query_bp` values that define the x-axis breaks and limits.
#                 Defaults to those of the global `n_samples_genera`, which is
#                 what the function used implicitly before, so that all
#                 panels share one x-axis.
#
# Returns:
#   A ggplot object.
plot_Nsamples_area <- function(df, title, reference_bp = n_samples_genera$query_bp) {
  df <- df %>%
    mutate(query_bp = parse_number(query_bp) * 1000)

  # Interleave the two halves of the palette (1st, (half+1)th, 2nd, ...) so
  # that taxa stacked next to each other get contrasting colours. Ranking the
  # positions handles odd counts and a single taxon without recycling.
  n_levels <- length(unique(df$taxon))
  palette <- viridis::turbo(n_levels)
  half_n <- ceiling(n_levels / 2)
  interleave_order <- order(c(
    2 * seq_len(half_n) - 1,
    2 * seq_len(n_levels - half_n)
  ))
  reordered_colors <- palette[interleave_order]

  axis_bp <- 1000 * parse_number(unique(reference_bp))

  ggplot(df, aes(x = query_bp, y = N, fill = taxon, color = taxon, group = taxon)) +
    geom_area(position = position_stack()) +
    # One manual scale drives both fill and outline colour; no legend.
    scale_fill_manual(
      values = reordered_colors,
      aesthetics = c('colour', 'fill'),
      guide = 'none'
    ) +
    scale_x_log10(
      labels = scales::label_number(scale_cut = scales::cut_si('bp')),
      breaks = axis_bp,
      limits = range(axis_bp)
    ) +
    scale_y_continuous(n.breaks = 10, minor_breaks = waiver()) +
    ggtitle(title) +
    ylab('Number of samples') +
    xlab('Base pairs in query images') +
    theme_few() +
    theme(
      axis.text.x = element_text(hjust = 1, angle = 45),
      panel.background = element_rect(fill = NA),
      panel.grid.major.y = element_line(colour = gray(0.5)),
      panel.grid.minor.y = element_line(colour = gray(0.6), linetype = 2),
      # Draw the panel (grid lines) on top of the areas.
      panel.ontop = TRUE
    )
}
# Build one stacked-area panel per dataset; the per-panel x-axis title is
# removed because a shared one is added below.
N_species <- plot_Nsamples_area(
  n_samples_species,
  title = expression(italic('Stigmaphyllon')~'species')
) +
  theme(axis.title.x = element_blank(), text = element_text(size = 8))
N_genera <- plot_Nsamples_area(
  n_samples_genera,
  title = 'Malpighiales genera'
) +
  theme(axis.title.x = element_blank(), text = element_text(size = 8))
p <- plot_grid(N_species, N_genera, nrow = 1)
# Add common plot title with white background
common_title <- ggdraw() +
  draw_label("Number of samples available for different data amounts",
             fontface = 'bold', x = 0.5, hjust = 0.5) +
  theme(plot.background = element_rect(fill = "white", color = "white"),
        plot.margin = unit(c(0, 0, 0, 0), "null"))
p <- plot_grid(common_title, p, ncol = 1, rel_heights = c(0.1, 1))
# Add common X-axis title with white background
x_axis_title <- ggdraw() +
  draw_label("Post-cleaning base pairs available",
             x = 0.5, hjust = 0.5, vjust = 1) +
  theme(plot.background = element_rect(fill = "white", color = "white"),
        plot.margin = unit(c(-1, 0, 0, 0), "lines"))
p <- plot_grid(p, x_axis_title, ncol = 1, rel_heights = c(1, 0.1))
print(p)
# Pass the figure explicitly: without `plot =`, ggsave() writes whatever
# last_plot() happens to be, which silently depends on the statements above.
ggsave('images_manuscript/supp_fig_n_samples.pdf', plot = p,
       width = 8, height = 4)
ggsave('images_manuscript/supp_fig_n_samples.png', plot = p,
       width = 8, height = 4, dpi = 1200)
Total number of SRA samples, split by whether they belong to the validation set:
```r
# Count rows of the model's input table for each value of `is_valid`
# (the first column of the CSV is dropped before counting).
# NOTE(review): the previous path 'varKoder/all_SRA/...' does not exist in
# the working directory (see the recorded error); it is changed here to match
# the other 'all_SRA/...' reads in this notebook -- confirm the file location.
read_csv("all_SRA/varkoder_trained_model_ML/input_data.csv")[-1] %>%
  group_by(is_valid) %>%
  summarise(N = n())
<!-- rnb-source-end -->
<!-- rnb-output-begin eyJkYXRhIjoiRXJyb3I6ICd2YXJLb2Rlci9hbGxfU1JBL3ZhcmtvZGVyX3RyYWluZWRfbW9kZWxfTUwvaW5wdXRfZGF0YS5jc3YnIGRvZXMgbm90IGV4aXN0IGluIGN1cnJlbnQgd29ya2luZyBkaXJlY3RvcnkgKCcvVXNlcnMvYnJ1bm8vRG9jdW1lbnRzL2RvY3NfbWFjYm9va2FpcjIwMTUvcGFwZXJzL3dvcmtpbmcvQ05OX3NwZGVsaW0vdmFyS29kZXJfZGV2ZWxvcG1lbnQvdmFyS29kZXJfdGVzdHMnKS5cbiJ9 -->
Error: ‘varKoder/all_SRA/varkoder_trained_model_ML/input_data.csv’ does not exist in current working directory (‘/Users/bruno/Documents/docs_macbookair2015/papers/working/CNN_spdelim/varKoder_development/varKoder_tests’).
<!-- rnb-output-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
Calculate precision and recall.
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuXG5jYWxjdWxhdGVfcHJlY2lzaW9uX3JlY2FsbCA8LSBmdW5jdGlvbihyZXN1bHRzLCB0YXhvbm9taWNfbGV2ZWw9TlVMTCkge1xuICAjIEZpbHRlciBsYWJlbHMgZm9yIGEgZ2l2ZW4gdGF4b25vbWljIGxldmVsXG4gIGZpbHRlcl9sYWJlbHMgPC0gZnVuY3Rpb24obGFiZWxzX2xpc3QsIGxldmVsKSB7XG4gICAgaWYgKGlzLm51bGwobGV2ZWwpKSB7XG4gICAgICByZXR1cm4obGFiZWxzX2xpc3QpXG4gICAgfSBlbHNlIHtcbiAgICAgIHJldHVybihncmVwKHBhc3RlMChcIl5cIiwgbGV2ZWwsIFwiOlwiKSwgbGFiZWxzX2xpc3QsIHZhbHVlID0gVFJVRSkpXG4gICAgfVxuICB9XG5cblxuICAjIEluaXRpYWxpemUgdmVjdG9ycyB0byBzdG9yZSBwcmVjaXNpb24gYW5kIHJlY2FsbFxuICBwcmVjaXNpb25fdmFsdWVzIDwtIG51bWVyaWMobnJvdyhyZXN1bHRzKSlcbiAgcmVjYWxsX3ZhbHVlcyA8LSBudW1lcmljKG5yb3cocmVzdWx0cykpXG5cbiAgIyBQcm9jZXNzIGVhY2ggcm93XG4gIGZvciAoaSBpbiBzZXFfbGVuKG5yb3cocmVzdWx0cykpKSB7XG4gICAgcXVlcnlfbGFiZWxzIDwtIGZpbHRlcl9sYWJlbHMocmVzdWx0cyRxdWVyeV9sYWJlbHNbW2ldXSwgdGF4b25vbWljX2xldmVsKVxuICAgIHByZWRpY3RlZF9sYWJlbHMgPC0gZmlsdGVyX2xhYmVscyhyZXN1bHRzJHByZWRpY3RlZF9saXN0W1tpXV0sIHRheG9ub21pY19sZXZlbClcblxuICAgIHRydWVfcG9zaXRpdmVzIDwtIHN1bShwcmVkaWN0ZWRfbGFiZWxzICVpbiUgcXVlcnlfbGFiZWxzKVxuICAgICMgT25seSBjb3VudCBub24tTkEgYW5kIG5vbi1lbXB0eSBwcmVkaWN0ZWQgbGFiZWxzIGZvciBmYWxzZSBwb3NpdGl2ZXNcbiAgICBmYWxzZV9wb3NpdGl2ZXMgPC0gc3VtKCFwcmVkaWN0ZWRfbGFiZWxzICVpbiUgcXVlcnlfbGFiZWxzICYgIWlzLm5hKHByZWRpY3RlZF9sYWJlbHMpICYgcHJlZGljdGVkX2xhYmVscyAhPSBcIlwiKVxuXG4gICAgIyBDb3VudCBmYWxzZSBuZWdhdGl2ZXMgY29uc2lkZXJpbmcgbm9uLU5BIGFuZCBub24tZW1wdHkgYWN0dWFsIGxhYmVsc1xuICAgIGZhbHNlX25lZ2F0aXZlcyA8LSBzdW0oIXF1ZXJ5X2xhYmVscyAlaW4lIHByZWRpY3RlZF9sYWJlbHMgJiAhaXMubmEocXVlcnlfbGFiZWxzKSAmIHF1ZXJ5X2xhYmVscyAhPSBcIlwiKVxuXG5cbiAgICBwcmVjaXNpb25fdmFsdWVzW2ldIDwtIGlmZWxzZSgodHJ1ZV9wb3NpdGl2ZXMgKyBmYWxzZV9wb3NpdGl2ZXMpID4gMCwgXG4gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHJ1ZV9wb3NpdGl2ZXMgLyAodHJ1ZV9wb3NpdGl2ZXMgKyBmYWxzZV9wb3NpdGl2ZXMpLCBcbiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBOQV9yZWFsXylcbiAgICByZWNhbGxfdmFsdWVzW2ldIDwtIGlmZWxzZSgodHJ1ZV9wb3NpdGl2ZXMgKyBmYWxzZV9uZWdhdGl2ZXMpID4gMCwgXG4gICAgICAgICAgICAgICAgICAgICAgICAgICAgIC
AgdHJ1ZV9wb3NpdGl2ZXMgLyAodHJ1ZV9wb3NpdGl2ZXMgKyBmYWxzZV9uZWdhdGl2ZXMpLCBcbiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBOQV9yZWFsXylcbiAgfVxuXG4gICMgQ2FsY3VsYXRlIG1lYW4gcHJlY2lzaW9uIGFuZCByZWNhbGxcbiAgbWVhbl9wcmVjaXNpb24gPC0gbWVhbihwcmVjaXNpb25fdmFsdWVzLCBuYS5ybSA9IFRSVUUpXG4gIG1lYW5fcmVjYWxsIDwtIG1lYW4ocmVjYWxsX3ZhbHVlcywgbmEucm0gPSBUUlVFKVxuXG4gIHJldHVybih0aWJibGUodGF4b25vbWljX2xldmVsID0gdGF4b25vbWljX2xldmVsLCBwcmVjaXNpb24gPSBtZWFuX3ByZWNpc2lvbiwgcmVjYWxsID0gbWVhbl9yZWNhbGwpKVxufVxuXG5gYGAifQ== -->
```r
calculate_precision_recall <- function(results, taxonomic_level=NULL) {
  # Mean per-row precision and recall of predicted labels against query labels.
  #
  # results:         data frame with columns `query_labels` and
  #                  `predicted_list`, indexed per row with [[i]].
  # taxonomic_level: optional prefix; when given, only labels starting with
  #                  "<taxonomic_level>:" are compared. NULL compares all labels.
  #
  # Returns a one-row tibble with `precision` and `recall` (plus
  # `taxonomic_level` when it is not NULL). Rows whose denominator is zero
  # contribute NA and are ignored by the means.

  # Restrict a label vector to the requested taxonomic level.
  keep_level <- function(labels) {
    if (is.null(taxonomic_level)) {
      labels
    } else {
      grep(paste0("^", taxonomic_level, ":"), labels, value = TRUE)
    }
  }

  # hits / (hits + misses), or NA when there is nothing to divide by.
  ratio_or_na <- function(hits, misses) {
    total <- hits + misses
    if (total > 0) {
      hits / total
    } else {
      NA_real_
    }
  }

  # Score every row independently.
  per_row <- lapply(seq_len(nrow(results)), function(row) {
    truth <- keep_level(results$query_labels[[row]])
    guess <- keep_level(results$predicted_list[[row]])

    hits <- sum(guess %in% truth)
    # Wrong predictions: NA and empty-string predictions are not counted.
    wrong_guesses <- sum(!(guess %in% truth) & !is.na(guess) & guess != "")
    # Missed true labels: NA and empty-string labels are not counted.
    missed_truth <- sum(!(truth %in% guess) & !is.na(truth) & truth != "")

    c(precision = ratio_or_na(hits, wrong_guesses),
      recall = ratio_or_na(hits, missed_truth))
  })

  precision_values <- vapply(per_row, function(s) s[["precision"]], numeric(1))
  recall_values <- vapply(per_row, function(s) s[["recall"]], numeric(1))

  tibble(
    taxonomic_level = taxonomic_level,
    precision = mean(precision_values, na.rm = TRUE),
    recall = mean(recall_values, na.rm = TRUE)
  )
}
Precision and recall for species:
# Species-level precision and recall, one output row per query_bp value.
# Only rows whose actual_labels mention "species" are scored.
results %>%
  filter(str_detect(actual_labels, "species")) %>%
  split(.$query_bp) %>%
  map_dfr(
    function(rows_for_bp) calculate_precision_recall(rows_for_bp, "species"),
    .id = "query_bp"
  )
Precision and recall for genera:
# Genus-level precision and recall, one output row per query_bp value.
# Only rows whose actual_labels mention "genus" are scored.
results %>%
  filter(str_detect(actual_labels, "genus")) %>%
  split(.$query_bp) %>%
  map_dfr(
    function(rows_for_bp) calculate_precision_recall(rows_for_bp, "genus"),
    .id = "query_bp"
  )
Precision and recall for families:
# Family-level precision and recall, one output row per query_bp value.
# Only rows whose actual_labels mention "family" are scored.
results %>%
  filter(str_detect(actual_labels, "family")) %>%
  split(.$query_bp) %>%
  map_dfr(
    function(rows_for_bp) calculate_precision_recall(rows_for_bp, "family"),
    .id = "query_bp"
  )
Precision and recall for SRA families:
# Precision and recall for the SRA results, one output row per query_bp value.
# No taxonomic level is passed, so all labels are compared.
sra_by_query_bp <- split(varKoder_SRA_results, varKoder_SRA_results$query_bp)
map_dfr(sra_by_query_bp, calculate_precision_recall, .id = "query_bp")
Precision and recall for other taxa